from IPython.display import Image
Image(filename='imgs/banner.png')
No description has been provided for this image
%load_ext pretty_jupyter
import pendulum

Monday, 20 October 2025 at 07:55 PM (PDT)

%%html

<style>
    #Styling {
        font-weight: bold;
        font-family: Helvetica;
    }
</style>

goal


Notes about the notebook.

setup data

# dependencies
import sys
import re
import pandas as pd
from ipywidgets import HTML

sys.path.append(".")
import Summary
import Relative_risk
import Chi_square
# support methods
def format_count(v):
    return "{:,}".format(v)


def format_prop(prop, decn=1, asperc=True):
    if asperc: prop = prop*100
    return "{}%".format(round(prop, decn))


def formatprop_num(num, den, aspercent, dec):
    if num == 0: return 0
    v = num/den
    if not aspercent: out = round(v, dec)
    else: out = round(v * 100, dec)
    return out


def formatprop_str(num, den, aspercent, dec):
    if num == 0: return '0'
    v = num/den
    if not aspercent: out = f"%.{dec}f" % v
    else: out = f"%.{dec}f" % (v*100)
    if int(float(out)) == 0: return 'less than 1'
    return out


def formatprop(num, den, numeric, aspercent=False, dec=1):
    assert (den >= num)
    if numeric: out = formatprop_num(num=num, den=den, aspercent=aspercent, dec=dec)
    else: out = formatprop_str(num=num, den=den, aspercent=aspercent, dec=dec)
    return out


def report_fields(df, idcol, cols, fillval='NO DATA'):
    data = df[[idcol] + cols].drop_duplicates()
    if fillval:
        count = data[cols].fillna(fillval).value_counts().to_frame().reset_index()
        perc = data[cols].fillna(fillval).value_counts(normalize=True).to_frame().reset_index(
            ).rename(columns={'proportion': 'percent'})
    else:
        count = data[cols].value_counts().to_frame().reset_index()
        perc = data[cols].value_counts(normalize=True).to_frame().reset_index(
            ).rename(columns={'proportion': 'percent'})
    count['count'] = count['count'].apply(lambda x: f"{x:,}")
    perc.percent = perc.percent.apply(lambda x: f"{x*100:.2f}%")
    out = pd.merge(count, perc, on=cols)
    return out


def get_census():
    # county numbers from USCB DHC https://data.census.gov/table/DECENNIALDHC2020.P9?t=Race%20and%20Ethnicity&g=050XX00US06075
    magic = {}
    magic['county'] = {
        'total': {'n': 873965},
        'Black': {'n': 45071}, # 'Black or African American alone'
        'White': {'n': 341306}, # 'White alone'
        'Latine': {'n': 136761}, # 'Hispanic or Latino'
        'Asian': {'n': 294220 + 3244 + 1570}, # 'Asian alone' + 'Native Hawaiian and Other Pacific Islander alone' + 'American Indian and Alaska Native alone'
        'Other/Unknown': {'n': 6347 + 45446}, # 'Some Other Race alone' + 'Population of two or more races'
    }
    realgroups = [k for k in magic['county'].keys() if k != 'total']
    assert magic['county']['total']['n'] == sum([magic['county'][group]['n'] for group in realgroups]), f"\
    Census counts by race group should add up to the total population count.\
    Found {magic['county']['total']['n']} for total population and {sum([magic['county'][group]['n'] for group in realgroups])} group total."
    for race_group in realgroups:
        magic['county'][race_group]['prop'] = formatprop(
            num=magic['county'][race_group]['n'],
            den=magic['county']['total']['n'],
            numeric=True, aspercent=False, dec=5)
    for race_group in realgroups:
        magic['county'][race_group]['perc'] = formatprop(
            num=magic['county'][race_group]['n'],
            den=magic['county']['total']['n'],
            numeric=False, aspercent=True, dec=1)
    return magic
# main
"""This is the processed version of publicly available data requested by the ACLU from the San Francisco District Attorney's Office ("SFDA") covering 2015-2022. Note that the SFDA did not necessarily produce the data themselves and it may also reflect SFPD or SF Sheriff entries."""
sfda = pd.read_parquet("sfda.parquet")
magic = get_census()
# support for breaking down data
cols = {
    'meta': [
        'source', 'filename', 'sheet', 'incident_number', 'court_number',],
    'person': [
        'age_at_arrest', 'gender', 'race', 'ethnicity', 'ethnicity_group',],
    'booking': [
        'incident_number', 'arrest_date', 'age_at_arrest',
        'booked_case_type', 'booked_charge_list',],
    'filing': [
        'court_number', 'filing_date',
        'filed_case_type', 'filed_charge_list', 'description',
        'case_dispo_date', 'case_dispo', 'dispo_description', 'dispo_description_group',
        'status_ctnum', 'status_ctnum_agg', 'status_ctnum_group',],
}
grouped = [c for collist in cols.values() for c in collist]
lost = [c for c in sfda.columns if c not in grouped]
assert not any(lost), f"\
previously had grouped all available columns, however {
lost} have not been grouped."

preview data

Sample metadata.
75254
source ACLU_PRA
filename 2015-2022_San Francisco County_Proseuction Data (1).xlsx
sheet Arrests & DA Actions|Cases Filed
incident_number Z20201005-20010789
court_number 20010789
Sample defendant data.
73547
age_at_arrest 27.0
gender Male
race Black
ethnicity Black or African American
ethnicity_group Black
Sample booking data.
4868
incident_number Z20171212-02506701
arrest_date 2017-12-12 00:00:00
age_at_arrest NaN
booked_case_type None
booked_charge_list None
Sample filing data.
49450
court_number 18008054
filing_date NaT
filed_case_type None
filed_charge_list None
description Assault
case_dispo_date NaT
case_dispo NaN
dispo_description None
dispo_description_group None
status_ctnum None
status_ctnum_agg Discharged w/o further action
status_ctnum_group None

coverage

This dataset includes:

  • arrests (presumably by SFPD) between January 11, 2011 and December 31, 2021.
  • cases filed by the SFDA between January 2, 2015 and December 30, 2021.

Note that these data are the result of a PRA request to the SFDA and so are more accurately a reflection of the cases filed between 2015-2021, which happen to cover some arrests going back to 2011.

more about the filing dates

statistic filing_date
0 count 46613
1 mean 2018-05-09 18:49:05.770493184
2 min 2015-01-02 00:00:00
3 25% 2016-09-12 00:00:00
4 50% 2018-05-21 00:00:00
5 75% 2019-10-31 00:00:00
6 max 2021-12-30 00:00:00
assert sfda.filename.str.contains('San Francisco County').all()

setting up labels

LABELS are plain text phrases that explain what a unit of a variable represents in this context. For example, we often describe a case_filed value as 'a case against a defendant'.

These phrases are laced into the analytical template for presenting findings in plain text, so pay close attention to how the sentences read in the results to confirm it makes sense and appears correct.

# add case-based indicators
sfda['case_filed'] = sfda.court_number.notna()
sfda['conviction_any'] = sfda.dispo_description_group.isin((
    'imprisonment', 'jail_probation', 'other_conviction'))

# add charge-based indicators
pcs = {
    148: '148[A-Z]*|148\\.',
    211: '211',
}
for code, patt in pcs.items():
    for stage in ('booked', 'filed'): # note lack of a `convicted_charge_list` field
        chargecol = f'{stage}_charge_list'
        sfda[f'pc_{code}_{stage}'] = sfda[chargecol].str.contains(patt, na=False, flags=re.I)
sfda['booked_148_only'] = sfda.pc_148_booked & sfda.booked_charge_list.apply(
    lambda x: len(x.split(',')) == 1 if x else False)
LABELS = {
    'ethnicity_group': 'someone with a recorded race/ethnicity of',
    'def_black': 'a Black defendant',
    'def_white': 'a White defendant',
    'case_filed': 'a case against a defendant',
    'conviction_any': 'conviction on at least one charge', # the data are not more specific than this
    'pc_148_booked': 'a booked charge for resisting arrest',
    'booked_148_only': 'arrest(s) solely for resisting arrest',
    'pc_148_filed': 'a filed charge for resisting arrest',
    'pc_211_booked': 'a booked charge for robbery',
    'pc_211_filed': 'a filed charge for robbery',
}

sample charge indicator: variation of pc_148_booked

I filtered for cases in which the sole booked charge was PC 148, resisting arrest.

default value_counts() presentation

pc_148_booked count
0 False 76840
1 True 8243
booked_148_only count
0 False 84283
1 True 800

Summary module

background

Summary.Summary.__init__
<function Summary.Summary.__init__(self, df, params, labels)>

Calculation:

  • Table: df[[INDICATOR_COL, GROUP_COL]].groupby(GROUP_COL)[INDICATOR_COL].sum()
  • Description: Summarize by {GROUP_COL} all records where {INDICATOR_COL} is True Present:
  • Count/Percent: '{GROUP_COL.sum()/INDICATOR_COL.sum()*100}% ({GROUP_COL.sum()} of {INDICATOR_COL.sum()})'
  • Finding:
    • Of the {INDICATOR_COL.sum()},
      • {magic['GROUP_COUNTS'][GROUP_LABEL]} were for {GROUP_LABEL}
      • (repeated for each group appearing in GROUP_COL)
Summary.PARAMS
['INDICATOR_COL', 'INDICATOR_OP', 'GROUP_COL', 'RENAMER']

setup

pc148_sum = Summary.Summary(
    df=sfda,
    params={
        'INDICATOR_COL': 'booked_148_only',
        'INDICATOR_OP': 'are of',
        'GROUP_COL': 'ethnicity_group',
        'RENAMER': {
            'ethnicity_group': 'Race/Ethnicity',
            True: 'PC 148 alone',
            False: 'Other charge(s)'
        }
    },
    labels=LABELS,
)

text based summary: distribution across groups

Of the 800 arrest(s) solely for resisting arrest,

  • 48 or 6.0% are of someone with a recorded race/ethnicity of Asian.
  • 304 or 38.0% are of someone with a recorded race/ethnicity of Black.
  • 170 or 21.2% are of someone with a recorded race/ethnicity of Latine.
  • 22 or 2.8% are of someone with a recorded race/ethnicity of Other/Unknown.
  • 256 or 32.0% are of someone with a recorded race/ethnicity of White.

magic numbers: distribution across groups

pc148_sum_gc = pc148_sum.getmagic()['GROUP_COUNTS'].copy()
pc148_sum.getmagic()
{'INDICATOR_COL': 'booked_148_only',
 'INDICATOR_OP': 'are of',
 'GROUP_COL': 'ethnicity_group',
 'RENAMER': {'ethnicity_group': 'Race/Ethnicity',
  True: 'PC 148 alone',
  False: 'Other charge(s)'},
 'INDICATOR_COUNT': np.int64(800),
 'GROUP_COUNTS': {'Asian': 48,
  'Black': 304,
  'Latine': 170,
  'Other/Unknown': 22,
  'White': 256},
 'GROUP_PERCENTS': {'Asian': 6.0,
  'Black': 38.0,
  'Latine': 21.25,
  'Other/Unknown': 2.75,
  'White': 32.0}}

table: distribution across groups

Of the 85,083 cases considered, there are 800 arrest(s) solely for resisting arrest, with the following distribution:

Asian Black Latine Other/Unknown White Total
0 Other charge(s) 6620 (7.9%) 30337 (36.0%) 18386 (21.8%) 6841 (8.1%) 22099 (26.2%) 84283
1 PC 148 alone 48 (6.0%) 304 (38.0%) 170 (21.2%) 22 (2.8%) 256 (32.0%) 800
2 Total 6668 (7.8%) 30641 (36.0%) 18556 (21.8%) 6863 (8.1%) 22355 (26.3%) 85083

table: distribution within groups

Of the 85,083 cases considered, there are 800 arrest(s) solely for resisting arrest, with the following distribution:

Race/Ethnicity Other charge(s) PC 148 alone Total
0 Asian 6620 (99.3%) 48 (0.7%) 6668
1 Black 30337 (99.0%) 304 (1.0%) 30641
2 Latine 18386 (99.1%) 170 (0.9%) 18556
3 Other/Unknown 6841 (99.7%) 22 (0.3%) 6863
4 White 22099 (98.9%) 256 (1.1%) 22355
5 Total 84283 (99.1%) 800 (0.9%) 85083

Relative_risk module

For the contingency table to work, you need to setup a supplemental variable that is a version of the race/ethnicity field for each comparison group (ie. Black vs. White).

You could avoid setting up this variable by providing a filtered df that only includes records related to the comparison, as long as your comparison only involves groups in the data (ie. not comparing to non-Black defendants when you don't have that label encoded in the group variable).

background

Relative_risk.Contingency.__init__
<function Relative_risk.Contingency.__init__(self, df, params, labels)>

Calculation:

  • Table:
  • Description: Present:
  • Contingency table:
  • Finding:
Relative_risk.CONTINGENCY_PARAMS
['COMPARISON_GROUP_COL', 'OUTCOME_EVENT_COL', 'GIVEN_EVENT_COL']
Relative_risk.RATIO_PARAMS
['TREAT_GROUP_COL',
 'CONTROL_GROUP_COL',
 'OUTCOME_EVENT_OP',
 'OUTCOME_EVENT_COL',
 'GIVEN_EVENT_COL']

setup

sfda['def_black'] = sfda.ethnicity_group == 'Black'
sfda['def_white'] = sfda.ethnicity_group == 'White'
sfda['def_black_or_hispanic'] = sfda.ethnicity_group.isin(('Black', 'Latine'))
sfda['def_not_black'] = ~sfda.def_black

# setup comparison groups based on race data
sfda.loc[(sfda.def_black) | (sfda.def_white), 'comparison_group_1'] = sfda.ethnicity_group
sfda.loc[sfda.def_black, 'comparison_group_2'] = sfda.ethnicity_group
sfda.loc[sfda.def_not_black, 'comparison_group_2'] = 'Race other than Black'
sfda.loc[sfda.def_white, 'comparison_group_3'] = sfda.ethnicity_group
sfda.loc[sfda.def_black_or_hispanic, 'comparison_group_3'] = 'Black or Hispanic'
pc148_rr_tbl = Relative_risk.Contingency(
    df=sfda,
    params={
        'COMPARISON_GROUP_COL': 'comparison_group_1',
        'OUTCOME_EVENT_COL': 'pc_148_filed',
        'GIVEN_EVENT_COL': 'booked_148_only',
    },
    labels=LABELS,
)
pc148_rr_sum = Relative_risk.Ratio(
    df=sfda,
    params={
        'TREAT_GROUP_COL': 'def_black',
        'CONTROL_GROUP_COL': 'def_white',
        'OUTCOME_EVENT_OP': 'faces',
        'OUTCOME_EVENT_COL': 'pc_148_filed',
        'GIVEN_EVENT_COL': 'booked_148_only',
    },
    labels=LABELS,
)

a filed charge for resisting arrest Black White Total
0 True 10 15 25
1 False 294 241 535
2 Total 304 256 560

The ratio of the probability that a Black defendant faces a filed charge for resisting arrest compared to the probability that a White defendant faces a filed charge for resisting arrest is 0.561.

In other words, the relative risk that a Black defendant faces a filed charge for resisting arrest is 43.9% less than a White defendant.

Chi_square module

This module has been setup to use the numbers exported from the Summary module.

background

Chi_square.DEFAULTS
{'SIG': 0.05, 'DDOF': 0}
Chi_square.CENSUS_PARAMS
['CENSUS_DICT', 'OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']
Chi_square.EQUAL_PARAMS
['OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']
params = {
    'Census': {
        'OBSERVED_DICT': pc148_sum_gc,
        'CENSUS_DICT': {k:v for k,v in magic['county'].items() if k != 'total'},
        'NULL_PHRASE': f"the distribution by race of {
            LABELS['booked_148_only']} follows the distribution of the general population"
    },
    'Equal': {
        'OBSERVED_DICT': pc148_sum_gc,
        'NULL_PHRASE': f"the distribution by race of {
            LABELS['booked_148_only']} is equal across all groups"
    },
}

Comparing racial distribution to that of the census

This test compares whether observed racial proportions match the proportion of each racial group in the general population. In interpreting the results, a p-value below 0.05 will be considered statistically significant.

This test results in a p-value of < 0.0001, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of arrest(s) solely for resisting arrest follows the distribution of the general population.

Comparing racial distribution to if each group were treated equally

This test compares whether observed racial proportions match the proportion of each racial group in the general population. In interpreting the results, a p-value below 0.05 will be considered statistically significant.

This test results in a p-value of < 0.0001, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of arrest(s) solely for resisting arrest is equal across all groups.